#################################################################
# Minimal NER for parliamentary candidates
#################################################################
# Content
#################################################################
# Dependencies
# Load data and basic data handling
# Bash-based NER
## Preprocessing 
## Bash evaluation
# Minified R-based NER
## Preprocessing of BASH-based NER annotated corpus
## Run minified R-based NER
# Consolidate corpus
#################################################################
# Start from a clean global environment so that objects left over from an
# earlier interactive run cannot leak into this one. Dot-prefixed (hidden)
# objects are not listed by ls() by default and therefore survive.
rm(list = ls(all.names = FALSE))
#################################################################
# Dependencies
#################################################################
# global
library(dplyr)
library(stringi)
library(stringr)
library(pbapply)
library(pbmcapply)
library(magrittr)
library(stringr)
library(readr)

# Resolve all relative paths below against the directory of this script.
# NOTE(review): rstudioapi::getActiveDocumentContext() needs a running RStudio
# session, so this line will not work under plain Rscript.
setwd(dirname(rstudioapi::getActiveDocumentContext()$path))

set.seed(123)

# load text data
txt <- readRDS('../../data/smd/complete_data/smd_all_curated_classified_sentiment.RDS')

# Give every document a zero-padded running id whose width equals the number
# of digits of the row count. The width is taken from the *integer* row count:
# converting to double first makes round counts such as 100000 print as
# "1e+05", which would yield a width that is too small.
txt %<>% mutate(id = sprintf(paste0('%0', nchar(nrow(txt)), 'd'), row_number()))

# Candidate list: one regular expression (name.regex) per candidate id.
df <- read.csv('../../support/candidates-2019/00-Named_Entity_List_withID.csv', stringsAsFactors = FALSE)

# Named character vector: values are the regexes, names are the candidate ids.
.identifier <- setNames(df$name.regex, df$id)
head(.identifier, n = 100)

#################################################################
# Bash-based NER
#################################################################
## Preprocessing texts
######################################
# Write the candidate regexes to a plain text file, one per line, without
# header or quoting.
# NOTE(review): presumably this is the pattern file for the `grep -f` call in
# the bash section, but that call reads `01-regex.txt` -- confirm the name.
write.table(.identifier, file = '../../support/candidates-2019/01-regex-SMD.txt',
            row.names = FALSE, col.names = FALSE, quote = FALSE)

# split text by sentence
nrow(txt)

# Insert a blank after a full stop that directly joins a lower-case letter to
# an upper-case letter or an opening parenthesis ("end.Start" -> "end. Start").
txt %<>% mutate(tx_new = gsub('(?<=[a-z])\\.(?=[A-Z\\(])', '. ', tx, perl = TRUE))

# Ids that do not yet have a file in ../output/txt/.
filesList <- gsub('\\.txt', '', list.files('../output/txt/'))
head(filesList)
toDo <- txt$id[!txt$id %in% filesList]
# Prints TRUE when existing files plus outstanding ids add up to all documents.
nrow(txt) == length(toDo) + length(filesList)

# Dated output directory for the per-document sentence files. It is computed
# once so that every file of a run lands in the same directory, and created
# with dir.create() at the location the files are actually written to (the
# previous `mkdir` call pointed at a different, mistyped path).
out_dir <- paste0('../output/txt_', Sys.Date())
dir.create(out_dir, showWarnings = FALSE, recursive = TRUE)

if (length(toDo) > 0) {
  # NOTE(review): the loop covers every row of `txt`, not only the ids in
  # `toDo`; `toDo` merely decides whether the loop runs at all. Confirm that
  # re-writing all documents into the dated directory is intended.
  pbmclapply(seq_len(nrow(txt)), function(i) {
    X <- txt$tx_new[i]
    temp <-
      # Split in front of '.', '!' or '?', except where the preceding text is
      # one of the listed abbreviations, another '?'/'!', a digit or a single
      # upper-case letter.
      strsplit(X, '((?<=.)((?<!Dr|phil|bzw|z\\.b|etc|usw|St|\\?|\\!|[0-9]|[A-Z]{1})(?=(\\.|\\!|\\?))))', perl = TRUE) %>%
      unlist(., recursive = FALSE) %>%
      # Append the leading "punctuation + one character" found on pieces that
      # start with punctuation.
      # NOTE(review): na.omit() shortens the extracted vector, so paste0()
      # recycles it against the pieces -- alignment looks fragile, verify.
      paste0(., as.character(na.omit(str_extract(., '^(\\.|\\!|\\?)(.){1}')))) %>%
      # Drop the leading punctuation (plus one character and optional blank).
      gsub('^[.!?»](.){1}(\\s)?', '', .) %>%
      #gsub('^[.!?]+', '\\1', .) %>%
      # Collapse runs of two or more sentence-final marks into a single one.
      gsub('(\\.|\\!|\\?){2,}', '\\1', .)
    # Keep only pieces longer than one character.
    temp <- temp[nchar(temp) > 1]
    write.table(temp,
                file = file.path(out_dir, paste0(txt$id[i], '.txt')),
                row.names = FALSE, col.names = FALSE, quote = FALSE)
  }, mc.cores = 4)

}

#################################################################
## Bash evaluation
######################################
# Truncate (or create) the log file via the shell.
system('> ../../data/support/01-txt-SMD-Log.txt')

# if you have parallel installed
setwd('../output/')
getwd()

# Time the shell pipeline: list all files of today's dated directory and let
# GNU parallel (4 jobs) run grep over them. grep reads its extended-regex
# patterns from a file (-E -f), prefixes each hit with the file name (-H) and
# prints four lines of context around it (-C4); output is appended to a dated
# log file in the current directory.
# NOTE(review): the pattern file (`01-regex.txt`) and the log written here
# (`01-txtLog_<date>.txt`) do not match the names used when the regexes are
# written out and when the log is read back in -- confirm the intended paths.
t.bash.parallel <- system.time(
  system(
    sprintf(
      'find txt_%s/. -type f | parallel --progress -L 100 -m -j 4 grep -C4 -E -H -f 01-regex.txt >> 01-txtLog_%s.txt',
      format(Sys.Date()),
      format(Sys.Date())
    )
  )
)
t.bash.parallel

#################################################################
# Minified R-based NER
#################################################################
## Preprocessing of BASH-based NER annotated corpus
######################################
setwd('../../data/support/')
logPath <- paste0('01-txt-SMD-Log', '.txt')

# Read the grep log, remove dash runs of five or more characters and reduce
# "txt--" to "txt-".
raw <- readLines(logPath) %>%
  gsub('-{5,}', '', .) %>%
  gsub('(txt\\-\\-)', 'txt\\-', ., perl = TRUE)
head(raw)

# Glue all lines together and cut the result at "----" / "--".
# CAUTION: the date suffix stripped here is *today's* date. If the log was
# produced on another day, the suffix stays in the path and the first digit
# run extracted below is no longer the document id.
split.raw <- strsplit(paste0(raw, collapse = ''), '(----)|(--)') %>%
  unlist(recursive = FALSE) %>%
  gsub(paste0("_", Sys.Date()), "", .)
head(split.raw)

# Text of each chunk with the "txt/./<id>.txt" file prefix removed.
# NOTE: this overwrites the `txt` data frame loaded at the top of the script.
txt <- split.raw %>%
  gsub('(\\-)?txt\\/\\.\\/[0-9]+\\.txt(\\-|\\:)?', '', .) %>%
  trimws()
head(txt)

# First run of digits in each chunk (NA when a chunk contains no digits).
doc.id <- str_extract(split.raw, '[0-9]+')
head(doc.id)

# Fail loudly instead of printing a message and carrying on without `corpus`.
if (length(txt) != length(doc.id)) {
  stop('Problem with dimensions!', call. = FALSE)
}
corpus <- tibble(doc.id, txt)

# Drop chunks for which no numeric id could be extracted.
corpus <- corpus[!is.na(as.numeric(corpus$doc.id)), ]
gc()

# For every corpus entry, test each candidate regex and collect the ids (the
# names of `.identifier`) of all regexes that match, joined by ';'.
# Entries without any match get NA.
matches <- pbmclapply(corpus$txt, function(sentc) {
  # grep(value = TRUE) returns the text on a match and character(0) otherwise;
  # after unlist() only the matching regexes remain, named by candidate id.
  np <- unlist(lapply(.identifier, function(x) grep(x, sentc, value = TRUE)))
  # A length test covers every "nothing matched" shape (character(0), NULL,
  # empty vector carrying attributes), unlike identical(np, character(0)).
  if (length(np) == 0) {
    return(NA_character_)
  }
  paste0(names(np), collapse = ';')
}, mc.cores = 7) %>%
  unlist() %>%
  unname()

# Attach the matched candidate ids to the corpus and store the result.
corpus %<>%
  mutate(person.id = matches)
write_rds(corpus, file = '../../data/2019/01-minified-corpus-NER_kandidaten_SMD.RDS')


